Below I've taken income statement data for SP500 stocks for the last four years and attempted to see if there is an easily defined relationship between changes to income statement attributes like Total Revenue, Net Income, or EBITDA and the resulting three month change in stock price. It is safe to say that there is no easily defined relationship, and some extraneous set of variables is responsible for changes in stock price.
import requests
import requests
import pandas as pd
import pickle
import os
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import time
import numpy as np
import plotly.io as pio
from tqdm import tqdm
from datetime import datetime
from dateutil.relativedelta import relativedelta
pd.options.mode.chained_assignment = None
pio.renderers.default='notebook'
df_sp500 = pd.read_csv('sp500_tickers.csv')
tickers = df_sp500['Symbol'].values
alpha_api_key = '296ULICGSB63VL7A'
sp500_close_fileo = open('sp500_close.pickle', 'rb')
close_datao = pickle.load(sp500_close_fileo)
inc_dfs = []
cls_dfs = []
for i in tqdm(range(len(tickers))):
t = tickers[i]
time.sleep(.85)
url = 'https://www.alphavantage.co/query?function=INCOME_STATEMENT&symbol='+t+'&apikey='+alpha_api_key
r = requests.get(url)
data = r.json()
if 'annualReports' in data and 'Monthly Time Series' in close_datao[t]:
inc_df = pd.DataFrame.from_dict(data['annualReports'])
inc_df['stock'] = t
close_data_keys = list(close_datao.keys())
close_df = pd.DataFrame.from_dict(close_datao[t]['Monthly Time Series']).T
close_df['stock'] = t
close_df.index = pd.to_datetime(close_df.index)
inc_df['fiscalDateEnding'] = pd.to_datetime(inc_df['fiscalDateEnding'])
dates = inc_df['fiscalDateEnding'].values
# create a new date column that is 120 days out from the the income sheet date
inc_df['dateplus'] = inc_df['fiscalDateEnding'] + np.timedelta64(120,'D')
# create a new column and set to none
inc_df['120D_stock_change'] = None
for index, row in inc_df.iterrows():
begin_date = row['fiscalDateEnding']
end_date = row['dateplus']
new_df = close_df[(close_df.index > begin_date) & (close_df.index <= end_date)]
if not new_df.empty:
#print(new_df)
end_cls = float(new_df['4. close'][0])
strt_cls = float(new_df['4. close'][-1])
cls_cng = (end_cls-strt_cls)/end_cls
# update income sheet table
inc_df.loc[index,'120D_stock_change'] = cls_cng
cls_dfs.append(close_df)
inc_dfs.append(inc_df)
close_df = pd.concat(cls_dfs)
inc_df = pd.concat(inc_dfs)
100%|██████████| 504/504 [10:32<00:00, 1.26s/it]
inc_df.to_pickle('income_statement_df.pkl')
close_df.to_pickle('close_data_income_statement.pkl')
inc_df = inc_df.dropna()
inc_df = inc_df[inc_df['ebitda']!="None"]
inc_df = inc_df[inc_df['totalRevenue']!="None"]
sm_df = inc_df[['stock','totalRevenue','ebitda','netIncome','120D_stock_change']]
sm_df['ebitda'].replace
sm_df['ebitda'] = pd.to_numeric(sm_df['ebitda'])
sm_df['netIncome'] = pd.to_numeric(sm_df['netIncome'])
sm_df['totalRevenue'] = pd.to_numeric(sm_df['totalRevenue'])
sm_df[['ebitda','netIncome','totalRevenue']] = sm_df[['ebitda','netIncome','totalRevenue']].pct_change()
sm_df[['ebitda','netIncome','totalRevenue']] = sm_df[['ebitda','netIncome','totalRevenue']].fillna(0)
sm_df.head()
| stock | totalRevenue | ebitda | netIncome | 120D_stock_change | |
|---|---|---|---|---|---|
| 0 | BAC | 0.000000 | 0.000000 | 0.000000 | -0.293161 |
| 1 | BAC | -0.040230 | -0.295824 | -0.440428 | 0.268443 |
| 2 | BAC | 0.066832 | 1.021270 | 0.532916 | -0.546397 |
| 3 | BAC | -0.002455 | -0.026842 | 0.026139 | 0.068999 |
| 4 | BAC | -0.042782 | -0.215061 | -0.352258 | -0.069519 |
fig = px.scatter_matrix(sm_df,
dimensions=['ebitda','totalRevenue','netIncome','120D_stock_change'])
fig.show()
import sklearn
from sklearn.linear_model import LinearRegression
rv_df = sm_df[['totalRevenue','120D_stock_change']]
rv_df['totalRevenue'].describe()
count 2458.000000 mean 0.571654 std 5.420123 min -0.988559 25% -0.147605 50% -0.055954 75% 0.032141 max 161.269663 Name: totalRevenue, dtype: float64
std = rv_df['totalRevenue'].std()
# drop everything over two standard deviations - we'll call these outliers
rv_df = rv_df[rv_df['totalRevenue'] < (std*2)]
rv_df = rv_df[rv_df['totalRevenue'] > (std*-2)]
std = rv_df['120D_stock_change'].std()
# drop everything over two standard deviations - we'll call these outliers
rv_df = rv_df[rv_df['120D_stock_change'] < (std*2)]
rv_df = rv_df[rv_df['120D_stock_change'] > (std*-2)]
from sklearn.cluster import KMeans
X = rv_df[['totalRevenue','120D_stock_change']]
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
kmeans.labels_
X = rv_df[['totalRevenue']]
y = rv_df[['120D_stock_change']]
reg = LinearRegression().fit(X,y)
e = reg.score(X, y)
x_range = np.linspace(X['totalRevenue'].min(),X['totalRevenue'].max())
reg_y = x_range * e
fig = px.scatter(rv_df,x='totalRevenue',y='120D_stock_change',color=kmeans.labels_,trendline='ols')
fig.show()
ni_df = sm_df[['netIncome','120D_stock_change']]
ni_df['netIncome'].describe()
count 2458.000000 mean 0.427088 std 26.185624 min -456.575758 25% -0.466606 50% -0.136116 75% 0.251767 max 1013.000000 Name: netIncome, dtype: float64
std = ni_df['netIncome'].std()
# drop everything over two standard deviations - we'll call these outliers
ni_df = ni_df[ni_df['netIncome'] < (std*2)]
ni_df = ni_df[ni_df['netIncome'] > (std*-2)]
std = ni_df['120D_stock_change'].std()
# drop everything over two standard deviations - we'll call these outliers
ni_df = ni_df[ni_df['120D_stock_change'] < (std*2)]
ni_df = ni_df[ni_df['120D_stock_change'] > (std*-2)]
X = ni_df[['netIncome','120D_stock_change']]
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
kmeans.labels_
X = ni_df[['netIncome']]
y = ni_df[['120D_stock_change']]
reg = LinearRegression().fit(X,y)
e = reg.score(X, y)
x_range = np.linspace(X['netIncome'].min(),X['netIncome'].max())
reg_y = x_range * e
fig = px.scatter(ni_df,x='netIncome',y='120D_stock_change',color=kmeans.labels_,trendline='ols')
fig.show()
ebitda_df = sm_df[['ebitda','120D_stock_change']]
ebitda_df['ebitda'].describe()
count 2458.000000 mean 0.210914 std 8.120435 min -181.962963 25% -0.286693 50% -0.094540 75% 0.167067 max 270.464325 Name: ebitda, dtype: float64
std = ebitda_df['ebitda'].std()
print('standard deviation is ', std)
# drop everything over two standard deviations - we'll call these outliers
ebitda_df = ebitda_df[ebitda_df['ebitda'] < (std*2)]
ebitda_df = ebitda_df[ebitda_df['ebitda'] > (std*-2)]
ebitda_df = ebitda_df[ebitda_df['120D_stock_change'] < (std*2)]
ebitda_df = ebitda_df[ebitda_df['120D_stock_change'] > (std*-2)]
standard deviation is 1.7754071466865673
ebitda_dist = px.histogram(ebitda_df,
x='ebitda',
marginal="box",
nbins=60)
ebitda_dist.show()
X = ebitda_df[['ebitda','120D_stock_change']]
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
kmeans.labels_
X = ebitda_df[['ebitda']]
y = ebitda_df[['120D_stock_change']]
reg = LinearRegression().fit(X,y)
e = reg.score(X, y)
x_range = np.linspace(X['ebitda'].min(),X['ebitda'].max())
reg_y = x_range * e
fig = px.scatter(ebitda_df,x='ebitda',y='120D_stock_change',color=kmeans.labels_,trendline='ols')
fig.show()